In [1]:
# !pip install -q -U "tensorflow-text==2.8.*"
# !pip3 install pycocotools
# # conda install -c conda-forge pycocotools
# !pip install tf-models-official==2.7.1
# !pip install torch
# !pip install seaborn
# # !pip install jovian
In [1]:
import os
import shutil

import tensorflow as tf
# import tensorflow_hub as hub
# import tensorflow_text as text
# from official.nlp import optimization  # to create AdamW optimizer

import matplotlib.pyplot as plt

tf.get_logger().setLevel('ERROR')
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import csv
from sklearn.utils import shuffle
import tensorflow as tf
# import jovian
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import matplotlib
In [2]:
device = torch.device('cpu') 
data_path   = '/Users/andraacsintoae/Desktop/MASTER/2nd year/Sem 2/InfoVisualization/Project/'
train_path = data_path + "netflix_titles.csv"

pd_train = pd.read_csv(train_path)
print(pd_train.head())
  show_id     type  title           director  \
0      s1  TV Show     3%                NaN   
1      s2    Movie   7:19  Jorge Michel Grau   
2      s3    Movie  23:59       Gilbert Chan   
3      s4    Movie      9        Shane Acker   
4      s5    Movie     21     Robert Luketic   

                                                cast        country  \
0  João Miguel, Bianca Comparato, Michel Gomes, R...         Brazil   
1  Demián Bichir, Héctor Bonilla, Oscar Serrano, ...         Mexico   
2  Tedd Chan, Stella Chung, Henley Hii, Lawrence ...      Singapore   
3  Elijah Wood, John C. Reilly, Jennifer Connelly...  United States   
4  Jim Sturgess, Kevin Spacey, Kate Bosworth, Aar...  United States   

          date_added  release_year rating   duration  \
0    August 14, 2020          2020  TV-MA  4 Seasons   
1  December 23, 2016          2016  TV-MA     93 min   
2  December 20, 2018          2011      R     78 min   
3  November 16, 2017          2009  PG-13     80 min   
4    January 1, 2020          2008  PG-13    123 min   

                                           listed_in  \
0  International TV Shows, TV Dramas, TV Sci-Fi &...   
1                       Dramas, International Movies   
2                Horror Movies, International Movies   
3  Action & Adventure, Independent Movies, Sci-Fi...   
4                                             Dramas   

                                         description  
0  In a future where the elite inhabit an island ...  
1  After a devastating earthquake hits Mexico Cit...  
2  When an army recruit is found dead, his fellow...  
3  In a postapocalyptic world, rag-doll robots hi...  
4  A brilliant group of students become card-coun...  

After a quick view at the dataset, it looks like a typical movies/shows dataset. We can see that there are NaN values in some columns¶

Data Preparation and Cleaning¶

In [3]:
pd_train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7787 entries, 0 to 7786
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       7787 non-null   object
 1   type          7787 non-null   object
 2   title         7787 non-null   object
 3   director      5398 non-null   object
 4   cast          7069 non-null   object
 5   country       7280 non-null   object
 6   date_added    7777 non-null   object
 7   release_year  7787 non-null   int64 
 8   rating        7780 non-null   object
 9   duration      7787 non-null   object
 10  listed_in     7787 non-null   object
 11  description   7787 non-null   object
dtypes: int64(1), object(11)
memory usage: 730.2+ KB

There are 8807 entries and 12 columns. There are a few columns that contain null values ('director', 'cast', 'country', 'date_added', 'rating')¶

In [4]:
pd_train.nunique()
Out[4]:
show_id         7787
type               2
title           7787
director        4049
cast            6831
country          681
date_added      1565
release_year      73
rating            14
duration         216
listed_in        492
description     7769
dtype: int64

We can see that for each of the columns, there are alot different unique values for some of them.¶

In [5]:
#pd_train.drop(columns = ['title','director','cast','country','date_added','release_year','rating','duration','listed_in'], axis =1 , inplace = True)
In [6]:
sns.heatmap(pd_train.isnull(), cbar=False)
plt.title('Null Values Heatmap')
plt.show()

Above in the heatmap and table we can see that there are quite a few null values in the dataset.¶

In [7]:
pd_train.isnull().sum()
Out[7]:
show_id            0
type               0
title              0
director        2389
cast             718
country          507
date_added        10
release_year       0
rating             7
duration           0
listed_in          0
description        0
dtype: int64
In [8]:
print('Number of training sentences: {:,}\n'.format(pd_train.shape[0]))
Number of training sentences: 7,787

In [9]:
pd_train.groupby('type').describe()
Out[9]:
release_year
count mean std min 25% 50% 75% max
type
Movie 5377.0 2012.920030 9.663282 1942.0 2012.0 2016.0 2018.0 2021.0
TV Show 2410.0 2016.191701 5.664826 1925.0 2015.0 2018.0 2019.0 2021.0
In [10]:
df_movie = pd_train[pd_train['type'] == 'Movie']
df_movie.shape
Out[10]:
(5377, 12)
In [11]:
df_tvshow = pd_train[pd_train['type'] == 'TV Show']
df_tvshow.shape
Out[11]:
(2410, 12)
In [12]:
df_movie.sample(2)
Out[12]:
show_id type title director cast country date_added release_year rating duration listed_in description
527 s528 Movie ANIMA Paul Thomas Anderson Thom Yorke United Kingdom June 27, 2019 2019 TV-PG 15 min Dramas, Independent Movies, Music & Musicals In a short musical film directed by Paul Thoma...
1585 s1586 Movie Dana Carvey: Straight White Male, 60 Marcus Raboy Dana Carvey United States November 4, 2016 2016 TV-MA 64 min Stand-Up Comedy Emmy-winning comedian Dana Carvey blends pitch...

Splitting the dataset¶

In this part it is nice to have datasets for both movie and TV show, because we can take a deep dive into just Netflix movie or tv show.¶

In [13]:
dataset_split_moovie = df_movie[pd_train['type']=='Movie'].copy()
dataset_split_moovie.head()
/Applications/anaconda3/envs/PyTorch/lib/python3.7/site-packages/ipykernel_launcher.py:1: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
  """Entry point for launching an IPython kernel.
Out[13]:
show_id type title director cast country date_added release_year rating duration listed_in description
1 s2 Movie 7:19 Jorge Michel Grau Demián Bichir, Héctor Bonilla, Oscar Serrano, ... Mexico December 23, 2016 2016 TV-MA 93 min Dramas, International Movies After a devastating earthquake hits Mexico Cit...
2 s3 Movie 23:59 Gilbert Chan Tedd Chan, Stella Chung, Henley Hii, Lawrence ... Singapore December 20, 2018 2011 R 78 min Horror Movies, International Movies When an army recruit is found dead, his fellow...
3 s4 Movie 9 Shane Acker Elijah Wood, John C. Reilly, Jennifer Connelly... United States November 16, 2017 2009 PG-13 80 min Action & Adventure, Independent Movies, Sci-Fi... In a postapocalyptic world, rag-doll robots hi...
4 s5 Movie 21 Robert Luketic Jim Sturgess, Kevin Spacey, Kate Bosworth, Aar... United States January 1, 2020 2008 PG-13 123 min Dramas A brilliant group of students become card-coun...
6 s7 Movie 122 Yasir Al Yasiri Amina Khalil, Ahmed Dawood, Tarek Lotfy, Ahmed... Egypt June 1, 2020 2019 TV-MA 95 min Horror Movies, International Movies After an awful accident, a couple admitted to ...
In [14]:
dataset_split_tv = df_tvshow[df_tvshow['type']=='TV Show'].copy()
dataset_split_tv.head()
Out[14]:
show_id type title director cast country date_added release_year rating duration listed_in description
0 s1 TV Show 3% NaN João Miguel, Bianca Comparato, Michel Gomes, R... Brazil August 14, 2020 2020 TV-MA 4 Seasons International TV Shows, TV Dramas, TV Sci-Fi &... In a future where the elite inhabit an island ...
5 s6 TV Show 46 Serdar Akar Erdal Beşikçioğlu, Yasemin Allen, Melis Birkan... Turkey July 1, 2017 2016 TV-MA 1 Season International TV Shows, TV Dramas, TV Mysteries A genetics professor experiments with a treatm...
11 s12 TV Show 1983 NaN Robert Więckiewicz, Maciej Musiał, Michalina O... Poland, United States November 30, 2018 2018 TV-MA 1 Season Crime TV Shows, International TV Shows, TV Dramas In this dark alt-history thriller, a naïve law...
12 s13 TV Show 1994 Diego Enrique Osorno NaN Mexico May 17, 2019 2019 TV-MA 1 Season Crime TV Shows, Docuseries, International TV S... Archival video and new interviews examine Mexi...
16 s17 TV Show Feb-09 NaN Shahd El Yaseen, Shaila Sabt, Hala, Hanadi Al-... NaN March 20, 2019 2018 TV-14 1 Season International TV Shows, TV Dramas As a psychology professor faces Alzheimer's, h...

Data preparation¶

One thing I found is that there appears to be a discrepancy between movies and shows. Movies are based on the duration of the movie and shows are based on the number of seasons.Next I will convert values in these columns into integers for both the movies and shows datasets.¶

In [15]:
dataset_split_moovie = dataset_split_moovie.replace('', 'null').replace(' ', 'null').dropna(subset=["show_id","type","title","director","cast","country","description","date_added","rating","duration"])
print(dataset_split_moovie.head())
  show_id   type  title           director  \
1      s2  Movie   7:19  Jorge Michel Grau   
2      s3  Movie  23:59       Gilbert Chan   
3      s4  Movie      9        Shane Acker   
4      s5  Movie     21     Robert Luketic   
6      s7  Movie    122    Yasir Al Yasiri   

                                                cast        country  \
1  Demián Bichir, Héctor Bonilla, Oscar Serrano, ...         Mexico   
2  Tedd Chan, Stella Chung, Henley Hii, Lawrence ...      Singapore   
3  Elijah Wood, John C. Reilly, Jennifer Connelly...  United States   
4  Jim Sturgess, Kevin Spacey, Kate Bosworth, Aar...  United States   
6  Amina Khalil, Ahmed Dawood, Tarek Lotfy, Ahmed...          Egypt   

          date_added  release_year rating duration  \
1  December 23, 2016          2016  TV-MA   93 min   
2  December 20, 2018          2011      R   78 min   
3  November 16, 2017          2009  PG-13   80 min   
4    January 1, 2020          2008  PG-13  123 min   
6       June 1, 2020          2019  TV-MA   95 min   

                                           listed_in  \
1                       Dramas, International Movies   
2                Horror Movies, International Movies   
3  Action & Adventure, Independent Movies, Sci-Fi...   
4                                             Dramas   
6                Horror Movies, International Movies   

                                         description  
1  After a devastating earthquake hits Mexico Cit...  
2  When an army recruit is found dead, his fellow...  
3  In a postapocalyptic world, rag-doll robots hi...  
4  A brilliant group of students become card-coun...  
6  After an awful accident, a couple admitted to ...  
In [16]:
dataset_split_moovie.duration = dataset_split_moovie.duration.str.replace(' min','').astype(int)
dataset_split_tv.rename(columns={'duration':'seasons'}, inplace=True)
dataset_split_tv.replace({'seasons':{'1 Season':'1 Seasons'}}, inplace=True)
dataset_split_tv.seasons = dataset_split_tv.seasons.str.replace(' Seasons','').astype(int)

Exploratory Analysis and Visualization¶

Netflix Film Types: Movie or TV Show¶

It'd be interesting to see the comparison between the total number of movies and shows in this dataset just to get an idea of which one is the majority.

In [17]:
plt.figure(figsize=(7,5))
g = sns.countplot(pd_train.type, palette="pastel");
plt.title("Count of Movies and TV Shows")
plt.xlabel("Type (Movie/TV Show)")
plt.ylabel("Total Count")
plt.show()
/Applications/anaconda3/envs/PyTorch/lib/python3.7/site-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  FutureWarning
In [18]:
plt.figure(figsize=(12,6))
plt.title("% of Netflix Titles that are either Movies or TV Shows")
g = plt.pie(pd_train.type.value_counts(), explode=(0.025,0.025), labels=pd_train.type.value_counts().index, colors=['skyblue','navajowhite'],autopct='%1.1f%%', startangle=180);
plt.legend()
plt.show()

Netflix Film Ratings¶

Now, we will explore the ratings which are based on the film rating system.

In [19]:
order =  ['G', 'TV-Y', 'TV-G', 'PG', 'TV-Y7', 'TV-Y7-FV', 'TV-PG', 'PG-13', 'TV-14', 'R', 'NC-17', 'TV-MA']
plt.figure(figsize=(15,7))
g = sns.countplot(pd_train.rating, hue=pd_train.type, order=order, palette="pastel");
plt.title("Ratings for Movies & TV Shows")
plt.xlabel("Rating")
plt.ylabel("Total Count")
plt.show()
/Applications/anaconda3/envs/PyTorch/lib/python3.7/site-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  FutureWarning
In [20]:
fig, ax = plt.subplots(1,2, figsize=(19, 5))
g1 = sns.countplot(pd_train.rating, order=order,palette="Set2", ax=ax[0]);
g1.set_title("Ratings for Movies")
g1.set_xlabel("Rating")
g1.set_ylabel("Total Count")
g2 = sns.countplot(pd_train.rating, order=order,palette="Set2", ax=ax[1]);
g2.set(yticks=np.arange(0,1600,200))
g2.set_title("Ratings for TV Shows")
g2.set_xlabel("Rating")
g2.set_ylabel("Total Count")
fig.show()
/Applications/anaconda3/envs/PyTorch/lib/python3.7/site-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  FutureWarning
/Applications/anaconda3/envs/PyTorch/lib/python3.7/site-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  FutureWarning
/Applications/anaconda3/envs/PyTorch/lib/python3.7/site-packages/ipykernel_launcher.py:11: UserWarning: Matplotlib is currently using module://matplotlib_inline.backend_inline, which is a non-GUI backend, so cannot show the figure.
  # This is added back by InteractiveShellApp.init_path()
In [68]:
import plotly.express as px

rating_2021 = pd_train.copy()
rating_2021 = rating_2021.dropna()

fig = px.bar(rating_2021.query("release_year==2020"), x="rating", color="type", title="Ratings for Movies and TV Shows released in 2020")
fig.show()

There is much more content for a more mature audience. For the mature audience, there is much more movie content than there are TV shows.¶

In [21]:
pd_train['year_added'] = pd.DatetimeIndex(pd_train['date_added']).year
dataset_split_moovie['year_added'] = pd.DatetimeIndex(dataset_split_moovie['date_added']).year
dataset_split_tv['year_added'] = pd.DatetimeIndex(dataset_split_tv['date_added']).year
pd_train['month_added'] = pd.DatetimeIndex(pd_train['date_added']).month
dataset_split_moovie['month_added'] = pd.DatetimeIndex(dataset_split_moovie['date_added']).month
dataset_split_tv['month_added'] = pd.DatetimeIndex(dataset_split_tv['date_added']).month

Content added each year¶

Now we will take a look at the amount content Netflix has added throughout the previous years.

In [22]:
import plotly.graph_objects as go

netflix_year = pd_train['year_added'].value_counts().to_frame().reset_index().rename(columns={'index': 'year','year_added':'count'})
# netflix_year = netflix_year[netflix_year.year != 2020]

# sort by  year
netflix_year = netflix_year.sort_values(by=['year'], axis=0)

dict_of_figure = dict({
    "data": [{"type": "bar",
              "x": netflix_year["year"],
              "y": netflix_year["count"]}],
    "layout": {"title": {"text": "Content added on Netflix each year between 2008 and 2021"}}
})

figure = go.Figure(dict_of_figure)

figure.show()
In [23]:
# create two dataframes, one for rows only containing Movie type and the other for rows only containing TV Shows type
dataset_movies = pd_train.apply(lambda values: values[pd_train['type'].isin(['Movie'])])
dataset_TV_shows = pd_train.apply(lambda values: values[pd_train['type'].isin(['TV Show'])])

Movie_month = dataset_movies['month_added'].value_counts().to_frame().reset_index().rename(columns={'index': 'month','month_added':'count'})
TVShows_month = dataset_TV_shows['month_added'].value_counts().to_frame().reset_index().rename(columns={'index': 'month','month_added':'count'})

# sort by month
Movie_month = Movie_month.sort_values(by=['month'], axis=0)
TVShows_month = TVShows_month.sort_values(by=['month'], axis=0)

from plotly.subplots import make_subplots

figure_month = make_subplots(rows=1, cols=2)

figure_month.add_trace(go.Bar(x = Movie_month["month"], y=  Movie_month["count"]), row=1, col=1)
figure_month.add_trace(go.Scatter(x = Movie_month["month"], y=  Movie_month["count"], mode="lines"), row=1, col=1)

figure_month.add_trace(go.Bar(x = TVShows_month["month"], y=  TVShows_month["count"]), row=1, col=2)
figure_month.add_trace(go.Scatter(x = TVShows_month["month"], y= TVShows_month["count"], mode="lines"), row=1, col=2)

figure_month.layout.title = 'Trend of content released each month (left - Movies; right - TV Shows)'

figure_month.show()

In the above graphs we can see that the highest count of both movies and TV Shows has been released on December and Januray, when producers expect people to be in-doors and spending holidays with their families, where one of the well-known activities is watching either Christmas movies or any other type of movies and TV Shows. Moreover, we can see that the lowest number of movies and TV shows released was around summer time, when people are usually spending times outdoors and not so much in front of the TV.¶

In [24]:
month_year_df = pd_train.groupby('year_added')['month_added'].value_counts().unstack().fillna(0).T

plt.figure(figsize=(11,8))
sns.heatmap(month_year_df, linewidths=0.025, cmap="YlGnBu")
plt.title("Content Heatmap")
plt.ylabel("Month")
plt.xlabel("Year")
plt.show()

In the abore heatmap, we can see that around 2014 in when Netflix began to increase their content count.We can see in 2020, the data stops at January since that is the last month available in the dataset.¶

Netflix Film Duration¶

In [25]:
fig, ax = plt.subplots(1,2, figsize=(19, 5))
g1 = sns.distplot(dataset_split_moovie.duration, color='skyblue',ax=ax[0]);
g1.set_xticks(np.arange(0,360,30))
g1.set_title("Duration Distribution for Netflix Movies")
g1.set_ylabel("% of All Netflix Movies")
g1.set_xlabel("Duration (minutes)")
g2 = sns.countplot(dataset_split_tv.seasons, color='skyblue',ax=ax[1]);
g2.set_title("Netflix TV Shows Seasons")
g2.set_ylabel("Count")
g2.set_xlabel("Season(s)")
fig.show()
/Applications/anaconda3/envs/PyTorch/lib/python3.7/site-packages/seaborn/distributions.py:2619: FutureWarning:

`distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).

/Applications/anaconda3/envs/PyTorch/lib/python3.7/site-packages/seaborn/_decorators.py:43: FutureWarning:

Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.

/Applications/anaconda3/envs/PyTorch/lib/python3.7/site-packages/ipykernel_launcher.py:11: UserWarning:

Matplotlib is currently using module://matplotlib_inline.backend_inline, which is a non-GUI backend, so cannot show the figure.

In the above images we can see the duration of Netflix films. On the left side we can see that the duration for Netflix movie closely resembles a normal distribution whith the average viewing time spanning about 90 minutes shich seems to make sense. In the right side in Netflix TV shows seems to be heavily skewed to the right where the majority of shows only have 1 season.¶

Below we can play with the interactive histogram in order to filter TV Shows and Movies by genre and check the number of seasons in a TV Show or the duration of movies by genre type. Moreover, on the last plot we have an overview of duration of Movies and TV Shows by type¶

In [79]:
import plotly.graph_objects as go
from ipywidgets import widgets

# dataset_split_tv.seasons

show_duration = dataset_split_tv.copy()
# dataset_split_moovie
show_duration.listed_in = show_duration.listed_in.str.split(', ', expand=True)[0]
show_duration['listed_in'].unique()

duration = widgets.IntSlider(
           value=min(show_duration.seasons),
           min=min(show_duration.seasons),
           max=max(show_duration.seasons),
           step=1.0,
           description='Number of seasons:',
           continuous_update=False)

interval = widgets.HBox(children=[duration])

dropdown = widgets.Dropdown(
    options=list(show_duration['listed_in'].unique()),
    value='International TV Shows',
    description='Genre:',
)

# Assign an empty figure widget with two traces
trace = go.Histogram(x=show_duration['seasons'], opacity=0.95, name='Number of Series in TV Show')

h = go.FigureWidget(data=[trace])

def validate():
    if dropdown.value in show_duration['listed_in'].unique():
        return True
    else:
        return False

def response(change):
    if validate():

        duration_genre = [i and j for i, j in
                          zip(show_duration['seasons'] <= duration.value, show_duration['listed_in'] == dropdown.value)]
        show_filtered = show_duration[duration_genre]
        x = show_filtered['seasons']
        with h.batch_update():
            h.data[0].x = x
            h.layout.xaxis.title = 'Number of seasons'
            h.layout.yaxis.title = 'Number of TV Shows'


duration.observe(response, names="value")
dropdown.observe(response, names="value")

dropdown_container = widgets.HBox([dropdown])
widgets.VBox([interval,
              dropdown_container,
              h])
VBox(children=(HBox(children=(IntSlider(value=1, continuous_update=False, description='Number of seasons:', ma…
In [78]:
import plotly.graph_objects as go
from ipywidgets import widgets

movie_duration = dataset_split_moovie.copy()
# dataset_split_moovie
movie_duration.listed_in = movie_duration.listed_in.str.split(', ', expand=True)[0]
movie_duration['listed_in'].unique()

# ['Dramas', 'Horror Movies', 'Action & Adventure', 'Documentaries',
#  'Independent Movies', 'Comedies', 'Sci-Fi & Fantasy',
#  'International Movies', 'Children & Family Movies', 'Movies',
#  'Classic Movies', 'Thrillers', 'Stand-Up Comedy', 'Anime Features',
#  'Music & Musicals', 'Cult Movies', 'Romantic Movies',
#  'LGBTQ Movies']

duration = widgets.IntSlider(
           value=min(movie_duration.duration),
           min=min(movie_duration.duration),
           max=max(movie_duration.duration),
           step=1.0,
           description='Duration (in minutes):',
           continuous_update=False)

interval = widgets.HBox(children=[duration])

dropdown = widgets.Dropdown(
    options=list(movie_duration['listed_in'].unique()),
    value='Dramas',
    description='Genre:',
)

# Assign an empty figure widget with two traces
trace = go.Histogram(x=movie_duration['duration'], opacity=0.95, name='Duration of movie')

g = go.FigureWidget(data=[trace])

def validate():
    if dropdown.value in movie_duration['listed_in'].unique():
        return True
    else:
        return False

def response(change):
    if validate():

        duration_genre = [i and j for i, j in
                          zip(movie_duration['duration'] <= duration.value, movie_duration['listed_in'] == dropdown.value)]
        movie_filtered = movie_duration[duration_genre]
        x = movie_filtered['duration']
        with g.batch_update():
            g.data[0].x = x
            g.layout.barmode = 'overlay'
            g.layout.xaxis.title = 'Duration (in minutes)'
            g.layout.yaxis.title = 'Number of Movies'


duration.observe(response, names="value")
dropdown.observe(response, names="value")

dropdown_container = widgets.HBox([dropdown])
widgets.VBox([interval,
              dropdown_container,
              g])
VBox(children=(HBox(children=(IntSlider(value=8, continuous_update=False, description='Duration (in minutes):'…
In [28]:
import plotly.express as px

pd_train = pd_train.replace('', 'null').replace(' ', 'null').dropna(subset=["description"])
pd_train = pd_train.replace('', 'null').replace(' ', 'null').dropna(subset=["country"])

fig = px.scatter(pd_train, x = 'title', y = 'duration', color='type',title='Duration of Netflix movies and TV shows in each country by type')
fig.show()

Countries with the most content available¶

In [29]:
filtered_countries = pd_train.set_index('title').country.str.split(', ', expand=True).stack().reset_index(level=1, drop=True);
filtered_countries = filtered_countries[filtered_countries != 'Country Unavailable']

plt.figure(figsize=(7,9))
g = sns.countplot(y = filtered_countries, order=filtered_countries.value_counts().index[:20])
plt.title('Top 20 Countries on Netflix')
plt.xlabel('Titles')
plt.ylabel('Country')
plt.show()
In [30]:
import plotly.express as px

dataset = pd_train.copy()
# drop columns from dataset that  aren't required for the plot and rows with null values
country_distribution = dataset.drop(columns = {'show_id', 'director', 'cast', 'rating', 'description', 'listed_in', 'date_added', 'duration', 'year_added', 'month_added'}).dropna(axis = 0)

# get all unique countries on the dataset and then select top 20 countries
filtered_countries = pd_train.set_index('title').country.str.split(', ', expand=True).stack().reset_index(level=1, drop=True);
filtered_countries_top = filtered_countries.value_counts().index[:20]

# use lambda function in order to keep only rows with Top 20 content creating countries
country_distribution = country_distribution.apply(lambda values: values[country_distribution['country'].isin(filtered_countries_top)])

data = country_distribution
figure_countries = px.scatter(data, x="release_year", y="country", color="type", title="Overview of the time when top countries entered the Movie and TV Shows industry")

figure_countries.show()

In the above plots we can see that United States and India are in the top when it comes to movies creation and to the time countries began movie creation. Another interesting fact is that, even though United Kingdom comes three in terms of amount of content created, there are countries such as Italy and Egypt that started creating movies way before, in 1950, whereas United Kingdom only entered the industry on 1970. Moreover, there is no surprise that United States was the first country creating TV Shows, given their well-known sitcoms.¶

Popular Genres¶

In [31]:
filtered_genres = pd_train.set_index('title').listed_in.str.split(', ', expand=True).stack().reset_index(level=1, drop=True);

plt.figure(figsize=(7,9))
g = sns.countplot(y = filtered_genres, order=filtered_genres.value_counts().index[:20])
plt.title('Top 20 Genres on Netflix')
plt.xlabel('Count')
plt.ylabel('Genres')
plt.show()
In [32]:
import plotly.express as px

data = pd_train.copy()
figure_genres = px.scatter(data, x="release_year", y="listed_in", color="type", title="Genre overview for Netflix content by type")

figure_genres.show()

Models¶

Further on, we are curious to understand whether there is a correlation between the description of the Netflix Data and the type, i.e. Movie and TV Shows, or the genre. Thus, we will proceed with two experiments:¶

Experiment 1:¶

  • The purpose of the experiment will be to predict Type (Movie or TV Show) based on the Description added to each sample.

Experiment 2:¶

  • The purpose of the experiment will be to predict Genre based on the Description added to each sample.
  • For labeling purposes, where the Movis/TV Shows is listed under multiple Genres, we will select the main Genre and not take into consideration the adiacent ones.

Details:¶

  • For both experiments we will pre-process the Description text data and transform it to a TF-IDF matrix in order to improve our accuracy on prediction.
  • Moreover, we have displayed a 2D and 3D Data Distribution Plot for Experiment 1.We managed to plot the test data by reducing its dimensionality via PCA algorithm and plotted its distribution based on the corresponding labels, 'type' for the first experiment and 'genre' for the second.
In [33]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.snowball import SnowballStemmer
from sklearn.model_selection import train_test_split
from nltk.tokenize import RegexpTokenizer
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.metrics import accuracy_score
In [197]:
COLORS = ['tab:blue', 'tab:orange', 'tab:green', 'tab:red', 'tab:purple', 'tab:brown', 'tab:pink', 'tab:olive', 'tab:cyan', 'tab:gray', ]
MARKERS = ['o', 'v', 's', '<', '>', '8', '^', 'p', '*', 'h', 'H', 'D', 'd', 'P', 'X']

# plotting 2D function
def plot2d(X, y_pred, y_true, mode=None, centroids=None):
    transformer = None
    X_r = X
    
    if mode is not None:
        transformer = mode(n_components=2)
        X_r = transformer.fit_transform(X)

    assert X_r.shape[1] == 2, 'plot2d only works with 2-dimensional data'


    plt.grid()
    for ix, iyp, iyt in zip(X_r, y_pred, y_true):
        plt.plot(ix[0], ix[1], 
                    c=COLORS[iyp], 
                    marker=MARKERS[iyt])
        
    if centroids is not None:
        C_r = centroids
        if transformer is not None:
            C_r = transformer.fit_transform(centroids)
        for cx in C_r:
            plt.plot(cx[0], cx[1], 
                        marker=MARKERS[-1], 
                        markersize=10,
                        c='red')

    plt.show()
    
# plotting 3D function
def plot3d(X, y_pred, y_true, mode=None, centroids=None):
    transformer = None
    X_r = X
    if mode is not None:
        transformer = mode(n_components=3)
        X_r = transformer.fit_transform(X)

    assert X_r.shape[1] == 3, 'plot3d only works with 3-dimensional data'

    fig = plt.figure()
    ax = fig.add_subplot(projection='3d')
    ax.elev = 30
    ax.azim = 120

    for ix, iyp, iyt in zip(X_r, y_pred, y_true):
        ax.plot(xs=[ix[0]], ys=[ix[1]], zs=[ix[2]], zdir='z',
                    c=COLORS[iyp], 
                    marker=MARKERS[iyt])
        
    if centroids is not None:
        C_r = centroids
        if transformer is not None:
            C_r = transformer.fit_transform(centroids)
        for cx in C_r:
            ax.plot(xs=[cx[0]], ys=[cx[1]], zs=[cx[2]], zdir='z',
                        marker=MARKERS[-1], 
                        markersize=10,
                        c='red')
    plt.show()

# Date preparation function for Data and Labels split on both experiments
def data_preparation(dataset, experiment = "description_type"):
    # 1. Type based on Description
    if experiment == "description_type":
        description_type = dataset.drop(columns = {'show_id', 'title', 'director', 'cast', 'country', 'date_added', 'release_year', 'rating', 'duration', 'listed_in', 'year_added', 'month_added'}).dropna(axis = 0)
        
        # Type based on Description => 2 labels
        unique_labels = description_type.type.unique()
        # array(['TV Show', 'Movie'], dtype=object)
        description_type = description_type.replace('TV Show', 0)
        description_type = description_type.replace('Movie', 1)
        
        Data = description_type.description
        Labels = description_type.type
    
    # 2. Genre based on Description
    elif experiment == "description_genre":
        description_genre = dataset.drop(columns = {'show_id', 'title', 'director', 'cast', 'country', 'date_added', 'release_year', 'rating', 'duration', 'type', 'year_added', 'month_added'}).dropna(axis = 0)
        # on Genre field we take only the first genre listed, which is the main one, and set it as the label
        description_genre.listed_in = description_genre.listed_in.str.split(', ', expand=True)[0]
        
        # 3. Genre based on Description => 36 labels
        # ['International TV Shows' 'Dramas' 'Horror Movies' 'Action & Adventure'
        #  'Crime TV Shows' 'Documentaries' 'Independent Movies' 'Comedies'
        #  'Sports Movies' 'Anime Series' 'Reality TV' 'TV Comedies' 'Docuseries'
        #  'Movies' 'British TV Shows' 'International Movies' 'Sci-Fi & Fantasy'
        #  "Kids' TV" 'Children & Family Movies' 'TV Shows' 'Classic Movies'
        #  'Thrillers' 'Stand-Up Comedy & Talk Shows' 'Stand-Up Comedy'
        #  'Anime Features' 'Music & Musicals' 'TV Dramas' 'TV Horror'
        #  'TV Action & Adventure' 'Classic & Cult TV' 'Romantic TV Shows'
        #  'Cult Movies' 'TV Sci-Fi & Fantasy' 'Romantic Movies'
        #  'Spanish-Language TV Shows' 'LGBTQ Movies']
        unique_genre = description_genre.listed_in.unique()
        j = 0
        for genre in unique_genre:
            description_genre = description_genre.replace(genre, j)
            j +=1
        
        Data = description_genre.description
        Labels = description_genre.listed_in
    else:
        print("Please set the experiment to either 'description_type' or 'description_genre'")
    return Data, Labels

Data Preparation¶

Choose the Experiment that should be performed:¶

Experiment 1 = "description_type"  -> Predict type based on description field
Experiment 2 = "description_genre" -> Predict main genre based on description field
In [198]:
dataset_description = pd_train.copy()

Data, Labels = data_preparation(dataset_description, experiment = "description_type")

Preprocess Description Field and transform the description field to TF-IDF matrix¶

In [199]:
tfidf = TfidfVectorizer(token_pattern = r"(?u)\b\w\w+\b", stop_words="english") 
Data_vect = tfidf.fit_transform(Data)

print(Data_vect.shape)

# print the TF-IDF Matrix
# Each word receives a weight computed based on its frequency on a single description and its frenquency overall.
# The more frequent the word is overall, the lower the weight, since its relevance drops on classification from an accuracy perspective
Data_vect = Data_vect.toarray()
print(Data_vect)
(7787, 17905)
[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.20814742 0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]
In [54]:
# take a closer look to the vocabulary
# print(tfidf.vocabulary_)

Data Distribution 2D and 3D Plot¶

Experiment 1: Distribution of data based on type (TV Shows and Movies)¶

Note:

  • Dimensionality reduction of our data was performed using PCA
  • For Experiment 2, given the high number of labels (32 labels) we did not plot the distribution of data
In [201]:
# Displays the data distribution based on the labels.
plot2d(Data_vect, Labels, Labels, PCA)
plot3d(Data_vect, Labels, Labels, PCA)

Split data in training and test data¶

In [181]:
training_data, test_data, training_labels, test_labels = train_test_split(Data_vect, Labels, test_size = 0.2, random_state=42)
In [163]:
# Check how balanced the dataset is
if experiment == "description_type":
    print(f"Number of test data is {len(test_labels)} out of which {len(np.where(test_labels == 0)[0])} TV Shows and {len(np.where(test_labels == 1)[0])} Movies.")
if experiment == "description_genre":
    print(f"Number of test data is {len(test_labels)} out of which {len(np.where(test_labels == 0)[0])} International TV Shows, {len(np.where(test_labels == 1)[0])} Dramas, {len(np.where(test_labels == 2)[0])} Horror Movies etc.")
else:
    print("Please choose the experiment you'd like to perform.")
Number of test data is 1558 out of which 468 TV Shows and 1090 Movies.

Models¶

Random Forest Classifier¶

In [182]:
classifier = RandomForestClassifier(random_state=0)
classifier.fit(training_data, training_labels)

test_predict = classifier.predict(test_data)

acc = accuracy_score(test_labels, test_predict)
print(f"The accuracy is {acc*100}.")
The accuracy is 37.869062901155324.

SVM¶

In [183]:
classifier = svm.SVC(C = 1)  #where C is the regularization parameter; smaller values => stronger regularization but more computing time
classifier.fit(training_data, training_labels)

test_predict = classifier.predict(test_data)

acc = accuracy_score(test_labels, test_predict)
print(f"The accuracy is {acc*100}.")
The accuracy is 32.22079589216945.

Logistic Regression¶

In [187]:
classifier = LogisticRegression(penalty='l2', C = 100)  #where C is the regularization parameter
classifier.fit(training_data, training_labels)

test_predict = classifier.predict(test_data)

acc = accuracy_score(test_labels, test_predict)
print(f"The accuracy is {acc*100}.")
The accuracy is 41.3992297817715.
/Applications/anaconda3/envs/PyTorch/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:818: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG,

Results¶

  • Type based on Description Experiment

    • Random Forest Classifier : The accuracy is 73.81258023106547
    • SVM (C = 1) : The accuracy is 75.1604621309371
    • Logistic Regression(C = 1) : The accuracy is 74.77535301668806
    • Logistic Regression(C = 10): The accuracy is 75.54557124518614
  • Type based on Description Experiment

    • Random Forest Classifier : The accuracy is 37.869062901155324.
    • SVM (C = 1) : The accuracy is 32.22079589216945.
    • Logistic Regression(C = 1) : The accuracy is 40.50064184852375
    • Logistic Regression(C = 10): The accuracy is 41.91270860077022.
  • Observations on Models:
    • Logistic Regression is the best model in terms of trade-off between computing time and accuracy.
    • SVM takes longer to train the model and, as we can see above, the results are similar to the other models.
    • Overall, Logistic Regression had the best performance on both experiments, where C = 10.